/* OPTIONS OBS=5000  NOREPLACE ; */   

/* 
   This program imports the CART imputation .csv file for all industries in the 2007 Census of Manufactures
*/


%include "ASMimplibs.sas";


            /* Read the CART imputation CSV into WORK.imputes_all_inds07.
               The first line of the file is skipped (firstobs=2), fields are
               comma-delimited and read with the DSD option, and MISSOVER leaves
               variables missing when a record ends early instead of moving on
               to the next line.
               The macro variable _EFIERR_ starts at 0 and is set to 1 as soon as
               any record raises _ERROR_. */
            %let _EFIERR_ = 0;
            data WORK.imputes_all_inds07;
               infile "imputes_all_inds07.csv"
                      delimiter = ',' MISSOVER DSD lrecl = 32767 firstobs = 2;

               /* The identifier and the industry code are 6-character strings;
                  every other column is numeric. */
               informat number NAICS_NEW_6 $6.
                        TAE CM TE PH SW TVS dinv
                        energycmratio wwswratio impsetnum best32.;

               format   number NAICS_NEW_6 $6.
                        TAE CM TE PH SW TVS dinv
                        energycmratio wwswratio impsetnum best12.;

               /* Columns are read in file order. */
               input number $ NAICS_NEW_6 $
                     TAE CM TE PH SW TVS dinv
                     energycmratio wwswratio impsetnum;

               if _ERROR_ then call symputx('_EFIERR_', 1);
            run;




   /* Replace the character identifier read from the CSV with a numeric one.
      NUMBER_NUM holds the numeric value of the 6-character NUMBER field and is
      written out under the name NUMBER through the RENAME= data set option.
      This replaces a separate PROC DATASETS rename that was never closed with
      QUIT. KEEP= is applied before RENAME=, so KEEP= still lists NUMBER_NUM;
      the original character NUMBER is not kept. */
   data imputes_all_inds07 (keep   = NUMBER_NUM NAICS_NEW_6 TVS dinv TE PH SW TAE CM
                                     energycmratio wwswratio impsetnum
                            rename = (NUMBER_NUM = number));
      set imputes_all_inds07;
      NUMBER_NUM = input(NUMBER, 6.);
   run;

   /* Order the imputes by the numeric identifier. */
   proc sort data = imputes_all_inds07;
      by number;
   run;


  /** Merge ids **/

 /* Working copy of the ID file restricted to the three identifier
    variables, then sorted by NUMBER. */
 data gooddata_ids_all_inds07;
    set allcmf.gooddata_ids_all_inds07;
    keep number survu_id firmid;
 run;

 proc sort data = gooddata_ids_all_inds07;
    by number;
 run;

 /* NOTE: Some industries may be missing from CART imputes because 
    there were not 2 or more variables with missing data for those industries. 
 */

  /* Match the ID file to the CART imputes by NUMBER.
       imputes_all_inds07_postmerge : rows present in both inputs; IMPSETNUM
                                      is written out under the name _IMPUTE_.
       in_ids_not_in_imputes07      : rows from the ID file with no imputes.
     Rows found only in the imputes file are not written to either output.
     The rename uses the RENAME= data set option; it replaces a separate
     PROC DATASETS step that was never closed with QUIT. */
  data imputes_all_inds07_postmerge (rename = (impsetnum = _IMPUTE_))
       in_ids_not_in_imputes07;
     merge gooddata_ids_all_inds07 (in = inids)
           imputes_all_inds07      (in = inimp);
     by number;

     /* Only rows that exist in the ID file are routed anywhere. */
     if inids then do;
        if inimp then output imputes_all_inds07_postmerge;
        else          output in_ids_not_in_imputes07;
     end;
  run;

  /*** Check the industries of the plants in the original dataset 
       that aren't in the CART imputations dataset.
       Want to make sure all of these are industries with < 2 variables
       with imputes/missing values.
  ***/
  /* Plant-level industry codes from the original data, followed by the two
     sorts by SURVU_ID that the next match-merge relies on. */
  data gooddata_all_inds07;
     set allcmf.gooddata_all_inds07;
     keep survu_id NAICS_NEW_6;
  run;

  proc sort data = gooddata_all_inds07;
     by survu_id;
  run;

  proc sort data = in_ids_not_in_imputes07;
     by survu_id;
  run;
  
  /* Attach the industry code to each plant that has IDs but no CART imputes.
     in_ids_not_in_imputes07 already carries a NAICS_NEW_6 column from the
     imputes side of the earlier merge, and it is blank for these unmatched
     rows. That copy is dropped on input so the only NAICS_NEW_6 in this step
     is the one from gooddata_all_inds07. Without the drop, a SURVU_ID with
     more than one row on the left would get a blank industry code from its
     second row onward, because the left-hand value is re-read and overwrites
     the retained right-hand value.
     Only plants present in both inputs are kept. */
  data in_ids_not_in_imputes07;
     merge in_ids_not_in_imputes07 (in = inid drop = NAICS_NEW_6)
           gooddata_all_inds07     (in = indata);
     by survu_id;
     if inid and indata;
  run;

  /* Tabulate the industry codes of the plants that have IDs but no imputes. */
  title "industries of plants in ID data but not in CART imputes dataset";
  proc freq data = in_ids_not_in_imputes07;
     tables NAICS_NEW_6;
  run;

  /*** For industries/plants in the original dataset 
       that aren't in the CART imputations dataset,
       get the original observations (which might include 
       Census Bureau imputations for no more than 1 variable).
  ***/

  /* Reduce the plants without imputes to their SURVU_ID key. */
  data survuids_not_in_imputes07;
     set in_ids_not_in_imputes07;
     keep survu_id;
  run;
  
  /* Derive the analysis variables from allcmf.cmf07asm06 and keep only the
     columns that the imputes file also carries:
       dinv          = tie - tib
       energycmratio = sum(ee, cf) / cm   (SUM ignores a missing term)
       wwswratio     = ww / sw
     A zero denominator is handled explicitly. The ratio is still missing in
     that case, exactly as before, but the step no longer raises a
     division-by-zero error (which sets _ERROR_ and dumps the observation to
     the log) for plants reporting cm = 0 or sw = 0. */
  data cmf07_nonAR (keep = survu_id firmid NAICS_NEW_6 TVS TE PH SW TAE CM
                           dinv energycmratio wwswratio);
     set allcmf.cmf07asm06;

     dinv = tie - tib;

     if cm = 0 then energycmratio = .;
     else           energycmratio = sum(ee, cf) / cm;

     if sw = 0 then wwswratio = .;
     else           wwswratio = ww / sw;
  run;

  /* Pull the CMF observations for the plants without imputes.
       cmf_data_not_in_imputes07 : SURVU_ID found in both inputs.
       in_surv_not_in_cmf        : SURVU_ID with no CMF row; this dataset
                                   should be empty.
     CMF rows whose SURVU_ID is not in the key list are discarded. */
  proc sort data = cmf07_nonAR;
     by survu_id;
  run;

  proc sort data = survuids_not_in_imputes07;
     by survu_id;
  run;

  data cmf_data_not_in_imputes07
       in_surv_not_in_cmf;
     merge survuids_not_in_imputes07 (in = insurv)
           cmf07_nonAR               (in = incmf);
     by survu_id;

     if insurv then do;
        if incmf then output cmf_data_not_in_imputes07;
        else          output in_surv_not_in_cmf;
     end;
  run;
 
    
  /*** Create 100 duplicates of these
       observations so that we can include them in the 
       TFPR IQR calculations with the 100 CART implicates.
  ***/ 
  /* Write every observation 100 times, numbering the copies 1 to 100 in
     _IMPUTE_. The explicit OUTPUT inside the loop is the only output, so
     each input row yields exactly 100 rows. */
  data cmf_data_not_in_imputes07;
     set cmf_data_not_in_imputes07;
     do _IMPUTE_ = 1 to 100;
        output;
     end;
  run;

   /** Stack the industries with non-imputed data with the CART-completed dataset,
       add the year variable and save the dataset.
   ***/

    /* Final file: the replicated non-imputed rows first, then the
       CART-completed rows, with YEAR set to 2007 on every row. */
    data allcmf.imputes_all_inds07;
       set cmf_data_not_in_imputes07
           imputes_all_inds07_postmerge;
       year = 2007;
    run;




